

* +++++++++++++++++++++
* CLEAN POPULATION COUNT DATA
* +++++++++++++++++++++

* globals
global data_raw_pop "${data_raw}/population/"

* population by age and gender
import delimited "${data_raw_pop}/12411-0018.csv", clear delimiter(";") bindquote(strict)

ren (v1 v2 v3 v4 v5 v6 v7 v8 v9 v10 v11 v12 ///
	v13 v14 v15 v16 v17 v18 v19 v20 v21 v22 v23 v24 v25 v26 v27 v28 v29 v30 ///
	v31 v32 v33 v34 v35 v36 v37) ///
	(year ags place_name ///
	pop_m_0_2 pop_m_3_5 pop_m_6_9 pop_m_10_14 pop_m_15_17 pop_m_18_19 ///
	pop_m_20_24 pop_m_25_29 pop_m_30_34 pop_m_35_39 pop_m_40_44 pop_m_45_49 /// 
	pop_m_50_54 pop_m_55_59 pop_m_60_64 pop_m_65_74 pop_m_75_plus ///
	pop_f_0_2 pop_f_3_5 pop_f_6_9 pop_f_10_14 pop_f_15_17 pop_f_18_19 ///
	pop_f_20_24 pop_f_25_29 pop_f_30_34 pop_f_35_39 pop_f_40_44 pop_f_45_49 /// 
	pop_f_50_54 pop_f_55_59 pop_f_60_64 pop_f_65_74 pop_f_75_plus)
	
* focus on data	
drop if _n <=6
drop if _n >=4761

replace year = substr(year,7,.)
destring year pop*, replace force

* make sure balanced panel	
bys ags : gen N = _N
su N, d
assert r(sd) ==0	
drop N 

* compute totals over age (by gender and for everyone)
foreach g in m f {
	egen pop_`g'_tot = rowtotal(pop_`g'_*)
}
gen pop_tot = pop_m_tot + pop_f_tot

bys ags : egen check = total(pop_tot)
drop if check ==0 
drop check

* missing should be zeros 
foreach var of varlist pop*tot {
	replace `var' = . if `var' ==0
}

tempfile pop_by_age_gender_kreis
save `pop_by_age_gender_kreis'
 
* foreigners: people from Syria (could easily get other countries from same source)
import delimited "${data_raw_pop}/12521-0041.csv", clear delimiter(";") bindquote(strict) maxquotedrows(500) 

* missing should be zeros
foreach var of varlist v5-v7 {
	replace `var'= "0" if `var' =="-"
}

destring v5-v7, replace force
	
ren v1 year
replace year = substr(year,7,.)
destring year, replace force

ren v2 ags
ren v3 place_name 

drop if _n<=6 | _n >=9527
reshape wide v5-v7, i(year ags) j(v4) string

foreach nat in for syr {
	if "`nat'" == "for"		local suff "Insgesamt"
	if "`nat'" == "syr"		local suff "Syrien"
	gen pop_`nat'_m = v5`suff'
	gen pop_`nat'_f = v6`suff' 	
	gen pop_`nat'_tot = v7`suff'
}

drop v*

* combine with data on full pop
merge 1:1 year ags using `pop_by_age_gender_kreis', assert(1 3) nogen keep(3)

* calculate fractions (where possible)
foreach nat in for syr {
	gen frac_`nat'_tot = pop_`nat'_tot / pop_tot
	foreach g in m f {
		gen frac_`nat'_`g' = pop_`nat'_`g' / pop_`g'_tot
	}
}

* create log fractions
foreach var of varlist frac_* {
	gen log_`var' = log(`var')
}

order year ags place_name frac_*, first

* drop places that are not in other data source (also from Statistisches Bundesamt)
* --> there clearly seems to be something
* weird going on in those places so we drop them
ds 
local varlist `r(varlist)'

preserve
	import delimited /// 
		"${data_raw_pop}/migration_integration_regionen/migration_integration_regionen_daten.csv" , clear
	ren rs ags
	tempfile oth_data_source
	save `oth_data_source'
restore

merge m:1 ags using `oth_data_source', keep(3) assert(1 3) nogen

gen missing_in_oth_data = azr_syrien ==.
foreach var in frac_syr_tot frac_syr_m frac_syr_f pop_syr_m pop_syr_f pop_syr_tot {
	replace `var' =. if missing_in_oth_data ==1
}

keep `varlist'

* construct average age
* assume linear within bin
foreach age in 0_2 3_5 6_9 10_14 15_17 18_19 ///
	20_24 25_29 30_34 35_39 40_44 45_49 /// 
	50_54 55_59 60_64 65_74 75_plus {
		gen pop_`age' = pop_m_`age' + pop_f_`age' 
	}
gen avg_age= (1*pop_0_2 + 4*pop_3_5 + 7.5*pop_6_9 + ///
	12*pop_10_14 + 16*pop_15_17 + 18.5*pop_18_19 + 22* pop_20_24 + ///
	27*pop_25_29 + 32*pop_30_34 + 37*pop_35_39 + 42*pop_40_44 + 47*pop_45_49 + ///
	52*pop_50_54 + 57*pop_55_59 + 62*pop_60_64 + 69.5* pop_65_74 + 81*pop_75_plus) / pop_tot
	
* share female
gen share_female = pop_f_tot / pop_tot	
	
* save
save "${data_derived}/kreis_total_pop_by_age_gender.dta", replace
